# import numpy as np
# import collections

# word_count_file_path = '../../../data/stackoverflow/datasets/stackoverflow.word_count'
# word_dict = None
# word_list = None
# _pad = '<pad>'
# _bos = '<bos>'
# _eos = '<eos>'
# '''
# This code follows the preprocessing steps of the TFF Shakespeare dataset:
# https://github.com/google-research/federated/blob/master/utils/datasets/shakespeare_dataset.py
# '''

# SEQUENCE_LENGTH = 80  # from McMahan et al AISTATS 2017
# # Vocabulary re-used from the Federated Learning for Text Generation tutorial.
# # https://www.tensorflow.org/federated/tutorials/federated_learning_for_text_generation
# CHAR_VOCAB = list(
#     'dhlptx@DHLPTX $(,048cgkoswCGKOSW[_#\'/37;?bfjnrvzBFJNRVZ"&*.26:\naeimquyAEIMQUY]!%)-159\r'
# )


# def get_word_dict():
#     global word_dict
#     if word_dict == None:
#         words = [_pad] + CHAR_VOCAB + [_bos] + [_eos]
#         word_dict = collections.OrderedDict()
#         for i, w in enumerate(words):
#             word_dict[w] = i
#     return word_dict


# def get_word_list():
#     global word_list
#     if word_list == None:
#         word_dict = get_word_dict()
#         word_list = list(word_dict.keys())
#     return word_list


# def id_to_word(idx):
#     return get_word_list()[idx]


# def char_to_id(char):
#     word_dict = get_word_dict()
#     if char in word_dict:
#         return word_dict[char]
#     else:
#         return len(word_dict)


# def preprocess(sentences, max_seq_len=SEQUENCE_LENGTH):

#     sequences = []

#     def to_ids(sentence, num_oov_buckets=1):
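#         '''
#         Map one sentence to character ids, wrap it in <bos>/<eos>, pad it to a
#         multiple of max_seq_len + 1, and split it into chunks of that length.
#         Args:
#             num_oov_buckets : The number of out-of-vocabulary buckets
#                 (not referenced in the body).
#             max_seq_len: Integer determining the shape of padded batches; taken
#                 from the enclosing preprocess() call, not a parameter of to_ids.
#         '''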
#         tokens = [char_to_id(c) for c in sentence]
#         tokens = [char_to_id(_bos)] + tokens + [char_to_id(_eos)]
#         if len(tokens) % (max_seq_len + 1) != 0:
#             pad_length = (-len(tokens)) % (max_seq_len + 1)
#             tokens += [char_to_id(_pad)] * pad_length
#         return (tokens[i:i + max_seq_len + 1]
#                 for i in range(0, len(tokens), max_seq_len + 1))

#     for sen in sentences:
#         sequences.extend(to_ids(sen))
#     return sequences


# def split(dataset):
#     ds = np.asarray(dataset)
#     x = ds[:, :-1]
#     y = ds[:, -1]
#     return x, y


# if __name__ == "__main__":
#     print(
#         split(
#             preprocess([
#                 'Yonder comes my master, your brother.',
#                 'Come not within these doors; within this roof\nThe enemy of all your graces lives.\nYour brother- no, no brother; yet the son-\nYet not the son; I will not call him son\nOf him I was about to call his father-\nHath heard your praises; and this night he means\nTo burn the lodging where you use to lie,\nAnd you within it. If he fail of that,\nHe will have other means to cut you off;\nI overheard him and his practices.\nThis is no place; this house is but a butchery;\nAbhor it, fear it, do not enter it.\nNo matter whither, so you come not here.',
#                 "To the last gasp, with truth and loyalty.\nFrom seventeen years till now almost four-score\nHere lived I, but now live here no more.\nAt seventeen years many their fortunes seek,\nBut at fourscore it is too late a week;\nYet fortune cannot recompense me better\nThan to die well and not my master's debtor.          Exeunt\nDear master, I can go no further. O, I die for food! Here lie",
#                 "[Coming forward] Sweet masters, be patient; for your father's",
#                 "remembrance, be at accord.\nIs 'old dog' my reward? Most true, I have lost my teeth in",
#             ])))
